{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "947c906f-136a-4f3c-9345-c57f0ed34124",
   "metadata": {},
   "source": [
    "# 划分训练-测试集\n",
    "\n",
    "张子豪 2023-2-22 6-9"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3534cc49-1636-4e30-ac03-215f583b0f2d",
   "metadata": {},
   "source": [
    "## 导入工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "11c7d96b-19a4-4236-bf1f-2977f2dcba99",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import random\n",
    "\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f73e65bb-899a-4c73-b06d-3d03c9d6ec63",
   "metadata": {},
   "source": [
    "## 指定数据集路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "78a5616c-8f14-459b-88a9-22e6ed0ced01",
   "metadata": {},
   "outputs": [],
   "source": [
    "Dataset_Path = 'Watermelon87_Semantic_Seg_Labelme'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b01aad73-b12d-4ad4-835f-9f2185a92911",
   "metadata": {},
   "source": [
    "## 查看数据集目录结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6a712788-3796-474c-af61-cead5276b087",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 Watermelon87_Semantic_Seg_Labelme/\n",
      "├─📁 img_dir/\n",
      "└─📁 ann_dir/\n"
     ]
    }
   ],
   "source": [
    "import seedir as sd\n",
    "sd.seedir(Dataset_Path, style='emoji', depthlimit=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2547a1f-4a38-4d56-a656-e86062a0e951",
   "metadata": {},
   "source": [
    "## 创建文件夹"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ab791fbe-21dc-405f-9072-b157e9047686",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(Dataset_Path)\n",
    "os.mkdir('train')\n",
    "os.mkdir('val')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f112267b-1044-45aa-9ad2-f2752a4b80e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "87"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(os.listdir('img_dir'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "90fe8fd3-2a2c-4e4c-a065-3352eb363eb4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "87"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(os.listdir('ann_dir'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b04360b9-c43b-4a01-a093-8908be5ea0a8",
   "metadata": {},
   "source": [
    "## 删除系统自动生成的多余文件"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f049f6ef-d54d-4d1d-83e7-a8d23bcb85a0",
   "metadata": {},
   "source": [
    "### 查看待删除的多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "23c17bcc-bcbc-4ed1-a829-1b63e99d2b80",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "69da9d50-11d1-4071-b818-dbb065a6474e",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d4d6ca60-2948-4105-b635-e0d390814067",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fedea526-5aac-42a6-b058-61b4ae4afe81",
   "metadata": {},
   "source": [
    "### 删除多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8d959813-5703-4e01-ba3e-f27b207d59f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '__MACOSX'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d2ea0bef-6954-44d5-8478-680c8d7eded1",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '.DS_Store'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7a2b568c-2685-4a13-90f6-185f60066360",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c0f41291-369b-4dd2-8265-8b54a980f6ea",
   "metadata": {},
   "source": [
    "### 验证多余文件已删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2dea6d10-e56f-4775-a055-933d09a9adde",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "23d47236-e846-42bd-9856-a0dd07a23621",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a85571be-399b-4e6e-b00e-f3fea415a17e",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78005455-4d7b-4c61-8197-0440c6d72069",
   "metadata": {},
   "source": [
    "## 在图像文件夹中，划分训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9559d312-7353-47e8-8207-84f31a229b9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_frac = 0.2  # 测试集比例\n",
    "random.seed(123) # 随机数种子，便于复现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "02992d76-09cf-43ac-8d39-03f18b04a12a",
   "metadata": {},
   "outputs": [],
   "source": [
    "folder = 'img_dir'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9edb977a-5a88-45ff-923d-0865c8b1421c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据集文件总数 87\n",
      "训练集文件个数 70\n",
      "测试集文件个数 17\n"
     ]
    }
   ],
   "source": [
    "img_paths = os.listdir(folder)\n",
    "random.shuffle(img_paths) # 随机打乱\n",
    "\n",
    "val_number = int(len(img_paths) * test_frac) # 测试集文件个数\n",
    "train_files = img_paths[val_number:]         # 训练集文件名列表\n",
    "val_files = img_paths[:val_number]           # 测试集文件名列表\n",
    "\n",
    "print('数据集文件总数', len(img_paths))\n",
    "print('训练集文件个数', len(train_files))\n",
    "print('测试集文件个数', len(val_files))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53ab77c4-51e4-4236-9003-a2b4f26fca46",
   "metadata": {},
   "source": [
    "## 将训练集图像移动至`train`目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "66bdba3f-4bfb-4865-a427-e330fefec387",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70/70 [00:00<00:00, 881.71it/s]\n"
     ]
    }
   ],
   "source": [
    "for each in tqdm(train_files):\n",
    "    src_path = os.path.join(folder, each)\n",
    "    dst_path = os.path.join('train', each)\n",
    "    shutil.move(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "375ab554-433a-42a9-b242-fb7f81283db2",
   "metadata": {},
   "source": [
    "## 将测试集图像移动至`val`目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "080bbc39-b982-4dd5-bead-d2c1901b7edf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 17/17 [00:00<00:00, 909.07it/s]\n"
     ]
    }
   ],
   "source": [
    "for each in tqdm(val_files):\n",
    "    src_path = os.path.join(folder, each)\n",
    "    dst_path = os.path.join('val', each)\n",
    "    shutil.move(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "13a71eb1-386d-4948-99a5-9fe4b97fbfb4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "87"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(os.listdir('train')) + len(os.listdir('val'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e4330f4-caa9-4cce-ab28-cf696ee2fb7f",
   "metadata": {},
   "source": [
    "## 将`train`和`val`剪切至`img_dir`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1e1eb99f-b1d1-45a9-a6dc-57217f35d092",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'img_dir/val'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shutil.move('train', 'img_dir/train')\n",
    "shutil.move('val', 'img_dir/val')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd58d4c3-c294-4ae7-bf62-bcbda14dac3f",
   "metadata": {},
   "source": [
    "## 在标注文件夹中，划分训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d2d794f2-9782-4b65-a601-f8cca34e1e92",
   "metadata": {},
   "outputs": [],
   "source": [
    "folder = 'ann_dir'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "9d9ddd45-8f2a-424f-a933-cf0c7766b45a",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.mkdir('train')\n",
    "os.mkdir('val')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d73dc78b-9f5e-4a19-92c2-629e55ac8c19",
   "metadata": {},
   "source": [
    "## 将训练集标注移动至`train`目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "db233f50-3dd1-46b4-a9db-0e56356f768a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70/70 [00:00<00:00, 1349.30it/s]\n"
     ]
    }
   ],
   "source": [
    "for each in tqdm(train_files):\n",
    "    src_path = os.path.join(folder, each.split('.')[0]+'.png')\n",
    "    dst_path = os.path.join('train', each.split('.')[0]+'.png')\n",
    "    shutil.move(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f049dbd6-4ef0-4b60-8e66-31f5c0ff2c6e",
   "metadata": {},
   "source": [
    "## 将测试集标注移动至`val`目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "23ab11de-acdd-4916-864a-779088aae69f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 17/17 [00:00<00:00, 1537.04it/s]\n"
     ]
    }
   ],
   "source": [
    "for each in tqdm(val_files):\n",
    "    src_path = os.path.join(folder, each.split('.')[0]+'.png')\n",
    "    dst_path = os.path.join('val', each.split('.')[0]+'.png')\n",
    "    shutil.move(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "682adf8f-02e5-418d-92be-8b5a5a09e912",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "87"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(os.listdir('train')) + len(os.listdir('val'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "214aa22e-9c0e-4a03-9f05-498674c628a6",
   "metadata": {},
   "source": [
    "## 将`train`和`val`剪切至`ann_dir`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "6665f803-4a9d-437d-8e46-c594e4c61a2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ann_dir/val'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shutil.move('train', 'ann_dir/train')\n",
    "shutil.move('val', 'ann_dir/val')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecd6fcf0-64f5-4477-85dd-4b38757ee8e2",
   "metadata": {},
   "source": [
    "## 删除系统自动生成的多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "31f9447a-648d-43a4-adca-755e3969398c",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('../')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "741a463d-c631-4e26-bc71-9ee23314338e",
   "metadata": {},
   "source": [
    "### 查看待删除的多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "6ece8a4a-6812-43fe-97cf-6320c6b29702",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "734d66a4-8440-487d-8bc7-6a1c5ed2ce9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "60aa36a8-7804-4194-a47b-43013f2404ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e94bdc01-6f4e-4909-a875-48e2f3765877",
   "metadata": {},
   "source": [
    "### 删除多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "580a96df-baf5-4c80-8efc-5f9cffa08c84",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '__MACOSX'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "65c8376b-50fb-46ab-9d8b-d9718556bb5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '.DS_Store'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "3cf53f24-fca0-42e9-aa40-878cf280115a",
   "metadata": {},
   "outputs": [],
   "source": [
    "!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "944e7e80-0cd6-4b5d-a7fc-ae716a827287",
   "metadata": {},
   "source": [
    "### 验证多余文件已删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "440cb364-adf7-439b-98f5-3dedc48e1888",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "940e7fa1-ff61-452d-a900-2383c9a1b658",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "7168766c-75eb-4664-8219-6d35aa755125",
   "metadata": {},
   "outputs": [],
   "source": [
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4ddfafc-95c5-4f61-97bc-2aa180673a1c",
   "metadata": {},
   "source": [
    "## 得到划分好训练集测试集的完整语义分割数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1a86c46-9d74-4953-b7bd-d8e11acf498e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
